Note: Grading is based both on your graphs and verbal explanations. Follow all best practices as discussed in class, including choosing appropriate parameters for all graphs. Do not expect the assignment questions to spell out precisely how the graphs should be drawn. Sometimes guidance will be provided, but the absense of guidance does not mean that all choices are ok.
IMPORTANT: THIS TEMPLATE DOES NOT INCLUDE THE SAMPLE GRAPHS THAT APPEAR IN THE .HTML VERSION OF THE ASSIGNMENT SO BE SURE TO VIEW THAT FILE AS WELL.
mycars missing patternsCreate a missing values plot for the mycars dataset created below (slightly different from the one in the lecture slides). Your plot should be in the style of extracat::visna() (no longer available on CRAN) using ggplot2 to create the main plot and two side plots and patchwork to put them together. It should show accurately: missing patterns, counts for missing by column and missing by pattern, and make it clear which row respresents complete cases. Bars in the side plots should be sorted and correspond to the rows and columns of the main plot. An example is provided though the aesthetics of your plot do not have to conform precisely to the example. Some code is provided to get you started with finding missing patterns. (Keep in mind that in the next question you will be turning this code into a function so to save yourself time later on write as generically as possible.)
library(tidyverse)
library(patchwork)
# Add NAs to mtcars dataset
set.seed(5702)
mycars <- mtcars
mycars[1:25, "gear"] <- NA
mycars[10:20, 3:5] <- NA
for (i in 1:10) mycars[sample(32,1), sample(11,1)] <- NA
We have written codes without using scale_alpha_manual and only using scale_fill_manual so that we can preserve scale_alpha_manual for some unknown additional modification needs in the future. Sorry about the redundant codes.
library(ggnewscale)
mycars_miss <- data.frame(is.na(mycars)) %>%
rownames_to_column("id")
mycars_miss_count <- mycars_miss %>%
group_by(across(colnames(mycars_miss)[-1])) %>%
summarize(miss_count2=n()) %>%
arrange(-miss_count2) %>%
rowid_to_column() %>%
pivot_longer(!c(rowid,miss_count2), names_to = "key", values_to = "missingFLG") %>%
mutate(missing = ifelse(missingFLG=="TRUE", 1, 0)) %>%
mutate(rowid2_fct=factor(rowid)) %>%
rename(rowid2=rowid) %>%
rowid_to_column() %>%
group_by(key) %>%
mutate(miss_count=sum(miss_count2*missing)) %>%
mutate(rowid=min(rowid)) %>%
ungroup() %>%
group_by(rowid2_fct) %>%
mutate(compcases = ifelse(sum(missing)==0, 1, 0)) %>%
ungroup() %>%
mutate(key=factor(key)) %>%
mutate(key=fct_reorder(key,-miss_count-0.5/rowid)) %>%
mutate(mod_missing=factor(missing-compcases)) %>%
mutate(mod_missing=fct_reorder(mod_missing,missing-compcases)) %>%
mutate(rowid2_fct=fct_reorder(rowid2_fct,miss_count2+0.5/rowid2))
mycars_miss_count_g1 <- mycars_miss_count %>%
group_by(key) %>%
summarize(miss_count=max(miss_count))
ymax_g1 <- max(mycars_miss_count_g1$miss_count)
g1 <- ggplot(mycars_miss_count_g1,aes(x = key, y = miss_count))+
geom_bar(stat = "identity", fill="cornflowerblue", alpha = 0.7)+
labs(x = "", y= "num rows \n missing:")+
scale_y_continuous(breaks=seq(0,ymax_g1,ceiling(ymax_g1/30)*10))+
theme_bw()+
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
ggtitle("Missing value patterns")+
theme(plot.title = element_text(size = 18))
mycars_miss_count_g2 <- mycars_miss_count %>%
group_by(rowid2_fct) %>%
summarize(miss_count2=max(miss_count2),compcases=min(compcases)) %>%
mutate(compcases_fct=factor(compcases)) %>%
mutate(compcases_fct=fct_reorder(compcases_fct,compcases))
ymax_g2 <- max(mycars_miss_count_g2$miss_count2)
g2 <- ggplot(mycars_miss_count_g2,aes(x = rowid2_fct, y = miss_count2))+
geom_bar(stat = "identity", aes(alpha = compcases_fct), fill="cornflowerblue")+
scale_alpha_manual(values = c(0.7,1), guide="none")+
labs(x = "", y= "row count")+
scale_y_continuous(breaks=seq(0,ymax_g2,ceiling(ymax_g2/15)*5))+
theme_bw() +
theme(panel.grid.major.y = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
coord_flip()
g3 <- ggplot(mycars_miss_count, aes(x = key,y = rowid2_fct))+
geom_tile(aes(fill = mod_missing), color = "white")+
labs(x = "variable", y= "missing pattern")+
theme_classic()
if(max(mycars_miss_count$compcases)==1){
x_textloc_g3 <- length(unique(mycars_miss_count$key))/2+0.5
y_textloc_g3 <- unique(mycars_miss_count[mycars_miss_count$mod_missing==-1,]$rowid2_fct)
g3 <- g3+
geom_text(x=x_textloc_g3, y=y_textloc_g3, label="complete cases")+
scale_fill_manual(values = c("grey65", "grey80", "mediumpurple1"), guide="none")
} else {
g3 <- g3+scale_fill_manual(values = c("grey80", "mediumpurple1"), guide="none")
}
g1 + plot_spacer() + g3 + g2 +
plot_layout(widths = c(10, 3), heights = c(3, 10))
Hints:
missing_patterns <- data.frame(is.na(mycars)) %>%
group_by_all() %>%
count(name = "count", sort = TRUE) %>%
ungroup()
alpha to control the highlight with scale_alpha_manual(values = ...) or use the ggnewscale package which provides for multiple fill scales in the same graph.mycars is shown below.## argument: data as a matrix or a dataframe
## output: missing value patterns plots
plot_missing <- function(data, percent = FALSE){
library(ggnewscale)
df_miss <- data.frame(is.na(data)) %>%
rownames_to_column("id")
df_miss_count <- df_miss %>%
group_by(across(colnames(df_miss)[-1])) %>%
summarize(miss_count2=n()) %>%
arrange(-miss_count2) %>%
rowid_to_column() %>%
pivot_longer(!c(rowid,miss_count2), names_to = "key", values_to = "missingFLG") %>%
mutate(missing = ifelse(missingFLG=="TRUE", 1, 0)) %>%
mutate(rowid2_fct=factor(rowid)) %>%
rename(rowid2=rowid) %>%
rowid_to_column() %>%
group_by(key) %>%
mutate(miss_count=sum(miss_count2*missing)) %>%
mutate(rowid=min(rowid)) %>%
ungroup() %>%
group_by(rowid2_fct) %>%
mutate(compcases = ifelse(sum(missing)==0, 1, 0)) %>%
ungroup() %>%
mutate(key=factor(key)) %>%
mutate(key=fct_reorder(key,-miss_count-0.5/rowid)) %>%
mutate(mod_missing=factor(missing-compcases)) %>%
mutate(mod_missing=fct_reorder(mod_missing,missing-compcases)) %>%
mutate(rowid2_fct=fct_reorder(rowid2_fct,miss_count2+0.5/rowid2))
df_miss_count_g1 <- df_miss_count %>%
group_by(key) %>%
summarize(miss_count=max(miss_count)) %>%
mutate(miss_percent=10^2*miss_count/nrow(df_miss))
if(percent == TRUE){
g1 <- ggplot(df_miss_count_g1,aes(x = key, y = miss_percent))+
geom_bar(stat = "identity", fill="cornflowerblue", alpha = 0.7)+
labs(x = "", y= "% rows \n missing:")+
scale_y_continuous(breaks=seq(0,100,25),limits=c(0,100))+
theme_bw()+
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
ggtitle("Missing value patterns")+
theme(plot.title = element_text(size = 18))
} else {
g1 <- ggplot(df_miss_count_g1,aes(x = key, y = miss_count))+
geom_bar(stat = "identity", fill="cornflowerblue", alpha = 0.7)+
labs(x = "", y= "num rows \n missing:")+
theme_bw()+
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
ggtitle("Missing value patterns")+
theme(plot.title = element_text(size = 18))
if(max(df_miss_count_g1$miss_count)<10){
g1 <- g1 + scale_y_continuous(breaks=seq(0,10,5),limits=c(0,10))
}
}
df_miss_count_g2 <- df_miss_count %>%
group_by(rowid2_fct) %>%
summarize(miss_count2=max(miss_count2),compcases=min(compcases)) %>%
mutate(miss_percent2=100*miss_count2/sum(miss_count2)) %>%
mutate(compcases_fct=factor(compcases)) %>%
mutate(compcases_fct=fct_reorder(compcases_fct,compcases))
if(percent == TRUE){
g2 <- ggplot(df_miss_count_g2,aes(x = rowid2_fct, y = miss_percent2))+
geom_bar(stat = "identity", aes(alpha = compcases_fct), fill="cornflowerblue")+
scale_alpha_manual(values = c(0.7,1), guide="none")+
labs(x = "", y= "% rows")+
scale_y_continuous(breaks=seq(0,100,25),limits=c(0,100))+
theme_bw() +
theme(panel.grid.major.y = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
coord_flip()
} else {
g2 <- ggplot(df_miss_count_g2,aes(x = rowid2_fct, y = miss_count2))+
geom_bar(stat = "identity", aes(alpha = compcases_fct), fill="cornflowerblue")+
scale_alpha_manual(values = c(0.7,1), guide="none")+
labs(x = "", y= "row count")+
theme_bw() +
theme(panel.grid.major.y = element_blank(),
panel.grid.minor = element_line(colour = "grey97", size = 0.05))+
coord_flip()
if(max(df_miss_count_g2$miss_count2)<10){
g2 <- g2 + scale_y_continuous(breaks=seq(0,10,5),limits=c(0,10))
}
}
g3 <- ggplot(df_miss_count, aes(x = key,y = rowid2_fct))+
geom_tile(aes(fill = mod_missing), color = "white")+
labs(x = "variable", y= "missing pattern")+
theme_classic()
if(max(df_miss_count$compcases)==1){
x_textloc_g3 <- length(unique(df_miss_count$key))/2+0.5
y_textloc_g3 <- unique(df_miss_count[df_miss_count$mod_missing==-1,]$rowid2_fct)
g3 <- g3+
geom_text(x=x_textloc_g3, y=y_textloc_g3, label="complete cases")+
scale_fill_manual(values = c("grey65", "grey80", "mediumpurple1"), guide="none")
} else {
g3 <- g3+scale_fill_manual(values = c("grey80", "mediumpurple1"), guide="none")
}
return(
g1 + plot_spacer() + g3 + g2 +
plot_layout(widths = c(10, 3), heights = c(3, 10))
)
}
plot_missing(mycars, percent = TRUE)
You either put the function code in a separate .R file or include it in the .Rmd file.
# Check the percent option for "mycar" data
plot_missing(mycars, percent = TRUE)
economics dataset in the ggplot2 package. (This is a test to see if your function works if there are no missing values.)plot_missing(economics)
plot_missing(economics, percent = TRUE)
HollywoodMovies2011 dataset in the Lock5withR package. You can shorten the column names so they don’t overlap in the plot.Holly <- Lock5withR::HollywoodMovies2011
names(Holly) <- substring(names(Holly), 1, 3)
plot_missing(Holly)
plot_missing(Holly, percent =TRUE)
Set up your final project repository following the EDAVproject template. You can either choose one team member’s GitHub account, or create an organization to house the final project. Be sure to follow all of the steps in the README so your bookdown book renders with your information, not the placeholders in the template. Edit the link below to point to your rendered book:
Make sure that all team members have write access to the repository and have practiced making contributions. Edit the link below to point to your contributors page, showing that all team members have made contributions to the repo (Note that we do not have the ability to see who has write access, only who has contributed):
Discuss a plan for dividing up the work for the final project and briefly summarize what each person will do.
Zhenyu Yuan:
Be mainly responsible for resuls of Q4-Q5 in the Final Project Group Formation and Data Check.
Be responsible for effectively use the method in causal inference for all questions if any (Zhenyu is taking the causal inference class)
Be primarily responsible for an interacted component
Yuki Ikeda:
Be mainly responsible for resuls of Q1-Q3 in the Final Project Group Formation and Data Check.
Be responsible for effectively use the method in statistical tests for all questions if any (Yuki is taking statistical inference and modeling class)
Be primarily responsible for data transformation and their reproducibility
Write a first draft of the missing values chapter of your final project. You do not have to include all of the data you use in the final project. Choose one file and analyze it using techniques discussed in class for missing values. Include a plot using your function from Q2 as well as verbal interpretation of the plot. Edit this link to point to your chapter:
https://Francis-ZY-Yuan.github.io/EDAV-Project/missing-values.html
Though very long at this moment of the first draft, we will fairly select and condense the content towards two graphs and comments as suggested.
Though it seems there’s missing data in our dataset, after digging into the questionnaire, we come to believe that the NAs aren’t really missing. We are dealing with survey data with some jump logics, so not all respondents see all the questions.
For example, in W33, the quesion CLIM2B (“Even if you are not sure, which one of these three statements about the Earth’s temperature comes closest to WHAT MOST CLIMATE SCIENTISTS SAY?”) are only shown if the answer to CLIM2A (“From what you have heard or read, which of these three statements about the Earth’s temperature comes closest to WHAT MOST CLIMATE SCIENTISTS SAY?”) is “not sure” or “refused”. In this case, a NA in CLIM2B simply means that the question is not triggered, and shouldn’t be considered as missing.
As for the answers Refused, they aren’t equivalent to missing either, and they are rare in the variables we care about (mostly less than 1%), so we think it’s fine to ignore them altogether.
The link above shows our analysis on the answer “Refused”. Since technically speaking, refused isn’t the same as missing, so we also chose to do the alternative of analysing another dataset.
As an alternative, I choose to analyse the trumpworld_poll dataset.
The dataset is in wide form, in which a row shows each country’s response to a question. A missing value indicates that the question wasn’t asked in the country in that year. Therefore, I think it’ll suffice to create a heatmap of for each question, which countries wasn’t asked that question in each year.
library(plotly)
library(fivethirtyeight)
library(tidyverse)
library(heatmaply)
empty_plot <- function(title = NULL){
p <- plotly_empty(type = "scatter", mode = "markers") %>%
config(
displayModeBar = FALSE
) %>%
layout(
title = list(
text = title,
yref = "paper",
y = 0.5
)
)
return(p)
}
na_trump = trumpworld_polls %>%
tibble() %>%
mutate(across(!question & !avg & !year, function(x){return(1*is.na(x))})) # Multiply by one to turn bool into numeric
hmap <- function(df, name){
df = df %>% as.data.frame()
row.names(df) <- df$year
df = df %>% select(!year&!avg)
df = df[, order(names(df))]
c = plot_ly(x=colnames(df), y=colSums(df), type="bar")
cc = empty_plot() # Placeholder
r = plot_ly(y=rownames(df), x=rowSums(df), type="bar") %>%
layout(yaxis = list(autorange="reversed") , xaxis=list(title="Count of yellow"))
p = heatmaply(df, Rowv=F, Colv=F, plot_method = "plotly",
main=paste("Question:", name$question,"(Yellow=Not Asked in Country in that year)"),
hide_colorbar=T)
row1 = subplot(c,cc)
row2 = subplot(p, r)
plt = subplot(row1, row2, nrows=2, margin=0.1)
return(plt)
}
plots<-na_trump %>%
group_by(question) %>%
group_map(~hmap(.x, .y))
plots[[1]]
plots[[2]]